Chow normalised dataset (GSE28475)

Note: Raw data can be found in the GEO entry, but decided to use already normalised version available in Gandal’s github repository for now.

Load data

datMeta =  read.csv('./../Data/Chow/Chow_GSE28475_datMeta.csv')
datExpr = read.delim('./../Data/Chow/Chow_GSE28475_quantile_normalized.txt')

# Filter datExpr columns
rownames(datExpr) = datExpr[,1]
datExpr = datExpr[,-1]
datExpr = datExpr[,seq(1, 65, by=2)]
colnames(datExpr) = gsub('[^0-9.-]','', colnames(datExpr))

idx = match(colnames(datExpr), datMeta$SampleID)
datMeta = datMeta[idx,]


# Annotate Probes
ensembl = useMart('ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', host='feb2014.archive.ensembl.org')
getinfo = c('illumina_humanref_8_v3', 'ensembl_gene_id', 'entrezgene', 'external_gene_id',
            'chromosome_name', 'start_position', 'end_position')
geneDat = getBM(attributes = getinfo, filters='illumina_humanref_8_v3', values=rownames(datExpr), mart=ensembl)
## Batch submitting query [>-------------------------] 4% eta: 4m Batch
## submitting query [=>------------------------] 6% eta: 5m Batch submitting
## query [=>------------------------] 8% eta: 5m Batch submitting query
## [==>-----------------------] 10% eta: 6m Batch submitting query
## [==>-----------------------] 12% eta: 6m Batch submitting query
## [===>----------------------] 14% eta: 6m Batch submitting query
## [===>----------------------] 16% eta: 6m Batch submitting query
## [====>---------------------] 18% eta: 6m Batch submitting query
## [====>---------------------] 20% eta: 6m Batch submitting query
## [=====>--------------------] 22% eta: 5m Batch submitting query
## [=====>--------------------] 24% eta: 5m Batch submitting query
## [======>-------------------] 27% eta: 5m Batch submitting query
## [======>-------------------] 29% eta: 5m Batch submitting query
## [=======>------------------] 31% eta: 5m Batch submitting query
## [=======>------------------] 33% eta: 5m Batch submitting query
## [========>-----------------] 35% eta: 5m Batch submitting query
## [=========>----------------] 37% eta: 5m Batch submitting query
## [=========>----------------] 39% eta: 5m Batch submitting query
## [==========>---------------] 41% eta: 4m Batch submitting query
## [==========>---------------] 43% eta: 4m Batch submitting query
## [===========>--------------] 45% eta: 4m Batch submitting query
## [===========>--------------] 47% eta: 4m Batch submitting query
## [============>-------------] 49% eta: 4m Batch submitting query
## [============>-------------] 51% eta: 4m Batch submitting query
## [=============>------------] 53% eta: 4m Batch submitting query
## [=============>------------] 55% eta: 3m Batch submitting query
## [==============>-----------] 57% eta: 3m Batch submitting query
## [==============>-----------] 59% eta: 3m Batch submitting query
## [===============>----------] 61% eta: 3m Batch submitting query
## [===============>----------] 63% eta: 3m Batch submitting query
## [================>---------] 65% eta: 3m Batch submitting query
## [=================>--------] 67% eta: 2m Batch submitting query
## [=================>--------] 69% eta: 2m Batch submitting query
## [==================>-------] 71% eta: 2m Batch submitting query
## [==================>-------] 73% eta: 2m Batch submitting query
## [===================>------] 76% eta: 2m Batch submitting query
## [===================>------] 78% eta: 2m Batch submitting query
## [====================>-----] 80% eta: 2m Batch submitting query
## [====================>-----] 82% eta: 1m Batch submitting query
## [=====================>----] 84% eta: 1m Batch submitting query
## [=====================>----] 86% eta: 1m Batch submitting query
## [======================>---] 88% eta: 1m Batch submitting query
## [======================>---] 90% eta: 46s Batch submitting query
## [=======================>--] 92% eta: 37s Batch submitting query
## [=======================>--] 94% eta: 28s Batch submitting query
## [========================>-] 96% eta: 19s Batch submitting query
## [========================>-] 98% eta: 9s Batch submitting query
## [==========================] 100% eta: 0s
idx = match(rownames(datExpr), geneDat$illumina_humanref_8_v3)
datGenes = cbind(rownames(datExpr), geneDat[idx,])

# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_with_ensembl_IDs.csv') %>%
              filter(!is.na(ID))
## Parsed with column specification:
## cols(
##   status = col_double(),
##   `gene-symbol` = col_character(),
##   `gene-name` = col_character(),
##   chromosome = col_character(),
##   `genetic-category` = col_character(),
##   `gene-score` = col_double(),
##   syndromic = col_double(),
##   `number-of-reports` = col_double(),
##   ID = col_character()
## )
# GO Annotations
GO_annotations = read.csv('./../Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID' = as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal' = 1)


# Combine SFARI and GO information
gene_info = data.frame('ilmnID'=rownames(datExpr), 'ID' = datGenes$ensembl_gene_id) %>% 
            left_join(SFARI_genes, by='ID') %>% 
            mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
            left_join(GO_neuronal, by='ID') %>% 
            mutate('gene.score'=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`)) %>%
            mutate('gene.score'=ifelse(is.na(gene.score), 'None', gene.score))


rm(idx, ensembl, getinfo, geneDat, GO_annotations)


Check sample distribution

RNA-Seq for 33 brain-tissue samples from the PFC, comprising 18 samples from control subjects and 15 from ASD subjects

print(paste0('Dataset includes ', nrow(datExpr), ' genes from ', ncol(datExpr), ' samples belonging to ', length(unique(datMeta$Subject)), ' different subjects.'))
## [1] "Dataset includes 24356 genes from 33 samples belonging to 33 different subjects."


Diagnosis distribution: Fairly balanced

table(datMeta$DX)
## 
##  Autism Control 
##      15      18


Brain region distribution: All samples belong to the PFC

Sex distribution: All samples are males

table(datMeta$SEX)
## 
##  F  M 
##  0 33


Age distribution: Subjects between 2 and 56 years old with a mean close to 20

summary(datMeta$AGE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    8.00   15.00   20.39   30.00   56.00



Filtering

  1. Filter genes with low expression levels

Seems like there is a change in behaviour around a mean expression of 9, although we would lose most of the probes if we filtered all genes below that threshold

mean_expr = data.frame('ID'=rownames(datExpr),'Mean'=rowMeans(datExpr, na.rm=T),
                       'SD'=apply(datExpr,1,function(x) sd(x, na.rm=T)))

p1 = mean_expr %>% ggplot(aes(Mean, SD)) + geom_point(alpha=0.01, color='#0099cc') + geom_smooth(method='lm', color='gray') +
     geom_vline(xintercept=6, color='gray', linetype='dashed') + theme_minimal()

p2 = mean_expr %>% ggplot(aes(Mean)) + geom_density(fill='#0099cc', color='#0099cc', alpha=0.5) + 
     geom_vline(xintercept=6, color='gray', linetype='dashed') + theme_minimal()

grid.arrange(p1, p2, ncol=2)

rm(mean_expr, p1, p2)

Filtering out genes with a mean expression lower than 6

to_keep = rowMeans(datExpr, na.rm=TRUE)>6
datExpr = datExpr[to_keep,]

print(paste0('Removed ', sum(!to_keep, na.rm=T), ', ', sum(to_keep, na.rm=T), ' remaining'))
## [1] "Removed 2530, 21826 remaining"
rm(to_keep)


  1. Filter outlier samples

Using node connectivity as a distance measure, normalising it and filtering out genes farther away than 2 standard deviations from the left (lower connectivity than average, not higher)

absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))

plot_data = data.frame('ID'=rownames(datMeta), 'sample'=1:length(z.ku), 'distance'=z.ku, 'Diagnosis'=datMeta$DX)
ggplotly(plot_data %>% ggplot(aes(sample, distance, color=Diagnosis)) + geom_point() + 
         geom_hline(yintercept=-2, color='gray') + theme_minimal())
print(paste0('Outlier samples: ', paste(as.character(plot_data$ID[plot_data$distance< -2]), collapse=', ')))
## [1] "Outlier samples: 51, 52, 75"
to_keep = z.ku > -2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]

print(paste0('Removed ', sum(!to_keep), ' samples, ', sum(to_keep), ' remaining'))
## [1] "Removed 3 samples, 30 remaining"
rm(absadj, netsummary, ku, z.ku, plot_data, to_keep)
print(paste0('After filtering, the dataset consists of ', nrow(datExpr), ' genes and ', ncol(datExpr), ' samples'))
## [1] "After filtering, the dataset consists of 21826 genes and 30 samples"





Differential Expression Analysis

Batch characterisation

Batch variable

table(datMeta$Batch, datMeta$DX, useNA='ifany')
##       
##        Autism Control
##   1         3       4
##   2         8      14
##   <NA>      1       0

Remove sample without Batch variable

to_keep = !is.na(datMeta$Batch)
datExpr = datExpr[,to_keep]
datMeta = datMeta[to_keep,]

Looking for unknown sources of batch effects with sva

Using sva instead of svaseq because it’s microarray data

mod = model.matrix(~ DX, datMeta)
mod0 = model.matrix(~ 1, datMeta)
sva_fit = datExpr %>% as.matrix %>% sva(mod=mod, mod0=mod0)
## Number of significant surrogate variables is:  7 
## Iteration (out of 5 ):1  2  3  4  5
rm(mod, mod0)

Include SV estimations to datMeta information

sv_data = sva_fit$sv %>% data.frame
colnames(sv_data) = paste0('SV', 1:ncol(sv_data))

datMeta = cbind(datMeta, sv_data)

rm(sv_data)



Differential Expression Analysis

Using lmFit

mod = model.matrix(~ Batch + SV1 + SV2 + SV3 + SV4 + SV5 + SV6 + SV7 + DX, data=datMeta)
corfit = duplicateCorrelation(datExpr, mod, block=datMeta$SampleID)
lmfit = lmFit(datExpr, design=mod, block=datMeta$sampleid, correlation=corfit$consensus)

fit = eBayes(lmfit, trend=T, robust=T)
top_genes = topTable(fit, coef=2, number=nrow(datExpr)) %>% mutate('ilmnID'=rownames(.))

DE_info = gene_info %>% left_join(top_genes, by='ilmnID')

rm(mod, corfit, lmfit, fit, top_genes)



Visualisations


Samples

PCA: Autism samples tend to cluster in the center and there may be an age related pattern in the 1st PC, but neither of these behaviours are very strong

pca = datExpr %>% t %>% prcomp

plot_data = data.frame('ID'=as.numeric(colnames(datExpr)), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>% 
            left_join(datMeta, by=c('ID'='SampleID')) %>% 
            dplyr::select('PC1','PC2','AGE','DX')

selectable_scatter_plot(plot_data, plot_data[,-c(1,2)])
rm(pca, plot_data)


Genes

  • First Principal Component explains 92% of the total variance

  • There’s a really strong (negative) correlation between the mean expression of a gene and the 1st principal component

pca = datExpr %>% prcomp

plot_data = data.frame('PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'PC3' = pca$x[,3], 
                       'MeanExpr'=rowMeans(datExpr), 'SD'=apply(datExpr,1,sd))

plot_data %>% ggplot(aes(PC1, PC2, color=MeanExpr)) + geom_point(alpha=0.2) + theme_minimal() + 
     scale_color_viridis() + ggtitle('PCA') +
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))

rm(pca, plot_data)




SFARI scores

Mean and SD

The higher the SFARI score the higher the mean expression but the lower the standard deviation (although this second relation is weaker). Same as Gandal’s dataset!

plot_data = data.frame('ID'=rownames(datExpr),
                       'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>% 
            left_join(DE_info, by=c('ID'='ilmnID'))

p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
              theme(legend.position='none'))

p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
              ggtitle('Mean Expression (left) and SD (right) by SFARI score') + 
              theme(legend.position='none'))

subplot(p1, p2, nrows=1)
rm(plot_data, p1, p2)


Log Fold Change

Except for score 6, there doesn’t seem to be any relation between logFC and SFARI score

ggplotly(DE_info %>% ggplot(aes(x=gene.score, y=abs(logFC), fill=gene.score)) + 
         geom_boxplot() + scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + 
         theme_minimal() + theme(legend.position='none'))
## Warning: Removed 2530 rows containing non-finite values (stat_boxplot).




Save preprocessed dataset

save(datExpr, datMeta, datGenes, DE_info, file='./../Data/Chow/cleaned_data.RData')
#load('./../Data/Gupta/preprocessed_data.RData')



Session info

sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] illuminaHumanv4.db_1.26.0 org.Hs.eg.db_3.7.0       
##  [3] AnnotationDbi_1.44.0      IRanges_2.16.0           
##  [5] S4Vectors_0.20.1          limma_3.38.3             
##  [7] glue_1.3.1                sva_3.30.1               
##  [9] BiocParallel_1.16.6       genefilter_1.64.0        
## [11] mgcv_1.8-26               nlme_3.1-137             
## [13] vsn_3.50.0                Biobase_2.42.0           
## [15] BiocGenerics_0.28.0       WGCNA_1.66               
## [17] fastcluster_1.1.25        dynamicTreeCut_1.63-1    
## [19] forcats_0.3.0             stringr_1.4.0            
## [21] dplyr_0.8.0.1             purrr_0.3.1              
## [23] readr_1.3.1               tidyr_0.8.3              
## [25] tibble_2.1.1              tidyverse_1.2.1          
## [27] viridis_0.5.1             viridisLite_0.3.0        
## [29] gridExtra_2.3             plotlyutils_0.0.0.9000   
## [31] plotly_4.8.0              ggplot2_3.1.0            
## [33] biomaRt_2.38.0           
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_1.4-1      htmlTable_1.11.2      base64enc_0.1-3      
##   [4] rstudioapi_0.7        affyio_1.52.0         bit64_0.9-7          
##   [7] mvtnorm_1.0-7         lubridate_1.7.4       xml2_1.2.0           
##  [10] codetools_0.2-15      splines_3.5.2         doParallel_1.0.14    
##  [13] impute_1.56.0         robustbase_0.93-0     knitr_1.22           
##  [16] Formula_1.2-3         jsonlite_1.6          Cairo_1.5-9          
##  [19] annotate_1.60.1       broom_0.5.1           cluster_2.0.7-1      
##  [22] GO.db_3.7.0           shiny_1.2.0           BiocManager_1.30.4   
##  [25] rrcov_1.4-3           compiler_3.5.2        httr_1.4.0           
##  [28] backports_1.1.2       assertthat_0.2.1      Matrix_1.2-15        
##  [31] lazyeval_0.2.2        cli_1.1.0             later_0.8.0          
##  [34] acepack_1.4.1         htmltools_0.3.6       prettyunits_1.0.2    
##  [37] tools_3.5.2           gtable_0.2.0          affy_1.60.0          
##  [40] Rcpp_1.0.1            cellranger_1.1.0      preprocessCore_1.44.0
##  [43] crosstalk_1.0.0       iterators_1.0.9       xfun_0.5             
##  [46] rvest_0.3.2           mime_0.6              statmod_1.4.30       
##  [49] XML_3.98-1.11         DEoptimR_1.0-8        MASS_7.3-51.1        
##  [52] zlibbioc_1.28.0       scales_1.0.0          promises_1.0.1       
##  [55] hms_0.4.2             RColorBrewer_1.1-2    curl_3.3             
##  [58] yaml_2.2.0            memoise_1.1.0         rpart_4.1-13         
##  [61] latticeExtra_0.6-28   stringi_1.4.3         RSQLite_2.1.1        
##  [64] pcaPP_1.9-73          foreach_1.4.4         checkmate_1.8.5      
##  [67] rlang_0.3.2           pkgconfig_2.0.2       matrixStats_0.54.0   
##  [70] bitops_1.0-6          evaluate_0.13         lattice_0.20-38      
##  [73] labeling_0.3          htmlwidgets_1.3       bit_1.1-14           
##  [76] tidyselect_0.2.5      robust_0.4-18         plyr_1.8.4           
##  [79] magrittr_1.5          R6_2.4.0              generics_0.0.2       
##  [82] Hmisc_4.1-1           fit.models_0.5-14     DBI_1.0.0            
##  [85] pillar_1.3.1          haven_1.1.1           foreign_0.8-71       
##  [88] withr_2.1.2           survival_2.43-3       RCurl_1.95-4.10      
##  [91] nnet_7.3-12           modelr_0.1.4          crayon_1.3.4         
##  [94] rmarkdown_1.12        progress_1.2.0        grid_3.5.2           
##  [97] readxl_1.1.0          data.table_1.12.0     blob_1.1.1           
## [100] digest_0.6.18         xtable_1.8-3          httpuv_1.5.0         
## [103] munsell_0.5.0